Using SQL and Plotly for More Advanced Data Visualizations

news
code
analysis
Author

Owen Sun

Published

January 27, 2024

url:https://owen020215.github.io/newhw/posts/hw1/

1. Creating a Climate Data Database

Explanation: Our first step is to construct a database with three critical components: temperatures, stations, and countries. Each table plays a unique role:

  • Temperatures: Records of temperature readings.

  • Stations: Geographic and descriptive details of the stations where readings are taken.

  • Countries: Information linking stations to their respective countries.

We use SQL for database management, ensuring efficient data retrieval and organization. This structure allows us to access and analyze climate data effectively.

import sqlite3
import pandas as pd
# No need to import Plotly Express and sklearn here since they're not used in the provided code snippet.

# Establish a connection to the SQLite database.
conn = sqlite3.connect("weather.db")

# Function to prepare the DataFrame for further processing or analysis.
def prepare_df(df):
    """
    Prepares the temperature DataFrame by setting a multi-level index, reshaping, 
    and cleaning the data for analysis.
    
    Parameters:
    df (pd.DataFrame): The original DataFrame loaded from a CSV file containing 
                       temperature data with an ID, Year, and monthly temperatures.
    
    Returns:
    pd.DataFrame: A DataFrame with the index reset and columns for ID, Year, Month, 
                  and Temperature, where 'Month' is extracted from the original 
                  column names and 'Temp' is adjusted to represent actual temperatures.
    """
    # Set a multi-level index using 'ID' and 'Year', then stack the DataFrame to 
    # collapse the month columns into a single column.
    df = df.set_index(keys=["ID", "Year"])
    df = df.stack()
    
    # Reset the index to turn the indices into columns and rename the resulting columns.
    df = df.reset_index()
    df = df.rename(columns={"level_2": "Month", 0: "Temp"})
    
    # Convert month column values from string to integer and adjust temperature values.
    df["Month"] = df["Month"].str[5:].astype(int)
    df["Temp"] = df["Temp"] / 100
    
    return df

# Load temperature data, prepare it using the defined function, and write to the SQLite database.
df = pd.read_csv("temps.csv")
df = prepare_df(df)
df.to_sql("temperatures", conn, if_exists='replace', index=False)

# Load country codes and station metadata, then write them to the SQLite database.
df = pd.read_csv("country-codes.csv")
df.to_sql("countries", conn, if_exists='replace', index=False)

df = pd.read_csv("station-metadata.csv")
df.to_sql("stations", conn, if_exists='replace', index=False)

# Close the database connection.
conn.close()
conn = sqlite3.connect('weather.db')
cursor = conn.cursor()

# Query to list all tables
cursor.execute("PRAGMA table_info('temperatures');")
columns = cursor.fetchall()

# Print the column names
print("Column names in 'temperatures' table:")
for col in columns:
    print(col[1])  # Column name is in the second position

conn.close()
Column names in 'temperatures' table:
ID
Year
Month
Temp

2. Writing a Query Function

Explanation: To extract meaningful information from our database, we’ll write a function query_climate_database() in Python. This function is designed to retrieve temperature readings for a specified country, within a given date range and month. It returns a Pandas dataframe, making data manipulation and analysis in Python straightforward. Here, we focus on clarity and efficiency, using Python’s f-strings for cleaner SQL queries.

from climate_database import query_climate_database
import inspect

# Use the function
df = query_climate_database(db_file='weather.db', country="India", year_begin=1980, year_end=2020, month=1)
print(df)

# To print the source code of the function
print(inspect.getsource(query_climate_database))
               NAME  LATITUDE  LONGITUDE Country  Year  Month   Temp
0     PBO_ANANTAPUR    14.583     77.633   India  1980      1  23.48
1     PBO_ANANTAPUR    14.583     77.633   India  1981      1  24.57
2     PBO_ANANTAPUR    14.583     77.633   India  1982      1  24.19
3     PBO_ANANTAPUR    14.583     77.633   India  1983      1  23.51
4     PBO_ANANTAPUR    14.583     77.633   India  1984      1  24.81
...             ...       ...        ...     ...   ...    ...    ...
3147     DARJEELING    27.050     88.270   India  1983      1   5.10
3148     DARJEELING    27.050     88.270   India  1986      1   6.90
3149     DARJEELING    27.050     88.270   India  1994      1   8.10
3150     DARJEELING    27.050     88.270   India  1995      1   5.60
3151     DARJEELING    27.050     88.270   India  1997      1   5.70

[3152 rows x 7 columns]
def query_climate_database(db_file, country, year_begin, year_end, month):
    
    # Open a new connection to the specified database file
    conn = sqlite3.connect(db_file)
    

    cmd = f"""
    SELECT 
        S.NAME, 
        S.LATITUDE, 
        S.LONGITUDE, 
        C.NAME as Country, 
        T.Year, 
        T.Month, 
        T.Temp
    FROM 
        stations S
    INNER JOIN 
        temperatures T ON T.ID = S.ID
    INNER JOIN 
        countries C ON S.ID LIKE C.'FIPS 10-4' || '%'
    WHERE 
        C.NAME = ? AND 
        T.Year BETWEEN ? AND ? AND 
        T.Month = ? AND
        T.Temp IS NOT NULL
    """
    
    # Execute the query and store the results in a pandas DataFrame
    df = pd.read_sql_query(cmd, conn, params=(country, year_begin, year_end, month))
    
    # Close the database connection
    conn.close()
    
    return df
def query_climate_database(db_file, country, year_begin, year_end, month):
    """
    Queries the climate database for temperature data for a specific country, within a specified 
    range of years and month, and returns the data as a pandas DataFrame.

    Parameters:
    - db_file (str): The file path to the SQLite database containing the climate data.
    - country (str): The name of the country for which to query temperature data.
    - year_begin (int): The beginning year of the period for the query.
    - year_end (int): The end year of the period for the query.
    - month (int): The month for which to query temperature data.

    Returns:
    pd.DataFrame: A DataFrame containing the station name, latitude, longitude, country name, 
                  year, month, and temperature data for the specified parameters. Each row in the 
                  DataFrame represents a unique record from the database matching the query criteria.

    The query joins three tables: stations, temperatures, and countries, filtering the results 
    by the specified country, year range, and month. It returns only records where the temperature 
    data is not null.
    """
    # Open a new connection to the specified database file
    conn = sqlite3.connect(db_file)
    
    # SQL command to retrieve the data
    cmd = f"""
    SELECT 
        S.NAME, 
        S.LATITUDE, 
        S.LONGITUDE, 
        C.NAME as Country, 
        T.Year, 
        T.Month, 
        T.Temp
    FROM 
        stations S
    INNER JOIN 
        temperatures T ON T.ID = S.ID
    INNER JOIN 
        countries C ON S.ID LIKE C.'FIPS 10-4' || '%'
    WHERE 
        C.NAME = ? AND 
        T.Year BETWEEN ? AND ? AND 
        T.Month = ? AND
        T.Temp IS NOT NULL
    """
    
    # Execute the query and store the results in a pandas DataFrame
    df = pd.read_sql_query(cmd, conn, params=(country, year_begin, year_end, month))
    
    # Close the database connection
    conn.close()
    
    return df



query_climate_database(db_file='weather.db',country = "India", 
                       year_begin = 1980, 
                       year_end = 2020,
                       month = 1)
NAME LATITUDE LONGITUDE Country Year Month Temp
0 PBO_ANANTAPUR 14.583 77.633 India 1980 1 23.48
1 PBO_ANANTAPUR 14.583 77.633 India 1981 1 24.57
2 PBO_ANANTAPUR 14.583 77.633 India 1982 1 24.19
3 PBO_ANANTAPUR 14.583 77.633 India 1983 1 23.51
4 PBO_ANANTAPUR 14.583 77.633 India 1984 1 24.81
... ... ... ... ... ... ... ...
3147 DARJEELING 27.050 88.270 India 1983 1 5.10
3148 DARJEELING 27.050 88.270 India 1986 1 6.90
3149 DARJEELING 27.050 88.270 India 1994 1 8.10
3150 DARJEELING 27.050 88.270 India 1995 1 5.60
3151 DARJEELING 27.050 88.270 India 1997 1 5.70

3152 rows × 7 columns

Example and Explanation of query_climate_database():

Code Explanation: Let’s consider an example where we query temperature data for India from 1980 to 2020 for the month of January. The output will be a neatly organized dataframe showing station names, coordinates, country, year, month, and average temperature.

3. Geographic Scatter Function for Yearly Temperature Increases

Explanation: Our next goal is to visualize how average yearly temperatures change within a country. For this, we create the temperature_coefficient_plot() function. This function generates an interactive scatterplot showing temperature changes at different stations. We’ll use Plotly Express for visualization, enabling us to create dynamic, informative maps. The color intensity of each point on the map indicates the degree of temperature change, providing an immediate visual understanding of climate trends.

from sklearn.linear_model import LinearRegression
import pandas as pd
import plotly.express as px

def temperature_coefficient(db_file, country, year_begin, year_end, month, min_obs):
    """
    Calculates the temperature change coefficient for each station in a specified country, 
    within a given time period and month, filtering out stations with observations below a minimum threshold.

    Parameters:
    - db_file (str): Path to the SQLite database file containing climate data.
    - country (str): Name of the country for which the temperature coefficient is to be calculated.
    - year_begin (int): Starting year for the period of interest.
    - year_end (int): Ending year for the period of interest.
    - month (int): The month for which the data is to be analyzed.
    - min_obs (int): Minimum number of observations required for a station to be included in the analysis.

    Returns:
    pd.DataFrame: A DataFrame with the original data plus a 'change' column representing the temperature change coefficient
                  for each station that meets the observation threshold. The coefficient reflects the estimated yearly 
                  temperature change based on a linear regression model.
    """
    df = query_climate_database(db_file, country, year_begin, year_end, month)
    value_counts = (df["NAME"].value_counts() >= min_obs) 
    new_value_counts = value_counts[value_counts==True]
    new_df = df[df['NAME'].isin(new_value_counts.index)].copy()
    new_df["change"] = new_df["NAME"]
    
    for i in new_df['NAME'].unique():
        df2 = new_df[new_df["NAME"]==i].copy()
        X = df2[["Year"]].copy()
        y = df2['Temp'].copy()
        model = LinearRegression()
        model.fit(X, y)
        first_coefficient = model.coef_[0]
        new_df.loc[new_df["NAME"]==i, "change"] = first_coefficient
        
    return new_df

def temperature_coefficient_plot(db_file, country, year_begin, year_end, month, min_obs, **kwargs):
    """
    Generates a scatter map plot showing the estimated yearly increase in temperature for each station in a specified country,
    within a given time period and month. Stations with observations below a specified minimum are excluded.

    Parameters:
    - db_file (str): Path to the SQLite database file containing climate data.
    - country (str): Name of the country for which the temperature change plot is to be generated.
    - year_begin (int): Starting year for the period of interest.
    - year_end (int): Ending year for the period of interest.
    - month (int): The month for which the data is to be visualized.
    - min_obs (int): Minimum number of observations required for a station to be included in the analysis.
    - **kwargs: Additional keyword arguments passed to `px.scatter_mapbox`.

    Returns:
    plotly.graph_objs._figure.Figure: A Plotly Figure object representing the scatter map plot of the estimated yearly temperature 
                                      increase for each qualifying station. The color scale indicates the magnitude of the 
                                      temperature change.
    """
    df = temperature_coefficient(db_file, country, year_begin, year_end, month, min_obs)
    df["Estimated Yearly Increase"] = pd.to_numeric(df["change"]).round(4)
    fig = px.scatter_mapbox(df, 
                            lat="LATITUDE",
                            lon="LONGITUDE", 
                            hover_name="NAME", 
                            color="Estimated Yearly Increase",
                            color_continuous_midpoint=0,
                            title=f"Estimate of Yearly Change in Temperature of Stations in {country} in month {month} from {year_begin} to {year_end}",
                            **kwargs)
    return fig
import plotly.express as px
color_map = px.colors.diverging.RdGy_r # choose a colormap

fig = temperature_coefficient_plot('weather.db', "India", 1980, 2020, 1, 
                                   min_obs=10,
                                   zoom=2,
                                   mapbox_style="carto-positron",
                                   color_continuous_scale=color_map)


fig.show()

Example of Using temperature_coefficient_plot():

Code Explanation: Imagine creating a plot for India, showing temperature changes in January from 1980 to 2020. The plot not only reveals geographical patterns in temperature changes but also provides detailed information on hover, like station names and precise temperature change values

import plotly.express as px
color_map = px.colors.diverging.RdGy_r # choose a colormap

fig = temperature_coefficient_plot('weather.db', "United States", 1970, 1990, 2, 
                                   min_obs = 10,
                                   zoom = 2,
                                   mapbox_style="carto-positron",
                                   color_continuous_scale=color_map)

fig.show()

Imagine creating a plot for India, showing temperature changes in January from 1980 to 2020.

4. Additional Interesting Figures

Explanation: To further explore our climate data, we’ll create two additional types of interactive visualizations. Each will address a unique question and provide different insights into the dataset.

We want to first answer the question: Which station in which year and month had the lowest average temperature?

import sqlite3
import pandas as pd

database_path = 'weather.db'
conn = sqlite3.connect(database_path)

def query_climate_database2(limit=10):
    """
    Queries the climate database to retrieve a specified number of records with the lowest temperatures,
    along with the corresponding country code, year, and month.

    The function executes a SQL query to select the top records with the lowest temperatures from
    the 'temperatures' table, ordering the results in ascending order by temperature. It returns a subset
    of information including the country code derived from the first two characters of the station ID,
    the temperature, year, and month.

    Parameters:
    - limit (int): The number of records to retrieve, defaulting to 10. Specifies the limit for the query
                   to restrict the number of rows returned, focusing on the lowest temperatures.

    Returns:
    pd.DataFrame: A DataFrame containing the query results with columns for the country code, temperature,
                  year, and month. This DataFrame is sorted by temperature in ascending order, showcasing
                  the records with the lowest temperatures up to the specified limit.

    Note: This function assumes an active database connection 'conn' is available and correctly configured
          to query from the 'temperatures' table.
    """
    
    cmd = f"""
    SELECT SUBSTR(id, 1, 2) AS country, temp, year, month
    FROM temperatures
    ORDER BY temp ASC LIMIT {limit}
    """

    df = pd.read_sql_query(cmd, conn)
    return df

# Example call to the function
query_climate_database2(10)
country Temp Year Month
0 AY -75.00 1987 8
1 AY -73.80 1983 7
2 AY -73.55 1978 8
3 AY -72.89 1967 8
4 AY -72.83 2019 6
5 AY -72.80 1975 8
6 AY -72.80 1997 7
7 AY -72.35 1982 8
8 AY -72.19 1979 7
9 AY -71.87 1998 9

Then, we want to visualize our response to the question: Which station in which year and month had the lowest average temperature?

import plotly.express as px

def temperature_coefficient_plot(n=100, **kwargs):
    """
    Generates a 3D scatter plot visualizing the n lowest temperatures recorded, showing their distribution across different years and months, and color-coded by country. This function queries the climate database for the n lowest temperatures and their corresponding year and month of occurrence, along with the country codes. It then uses Plotly Express to create a 3D scatter plot of these data points, with temperature on the x-axis, year on the y-axis, and month on the z-axis. Points are color-coded by country to provide visual differentiation between different countries' data. Parameters: - n (int): The number of records to retrieve and visualize, focusing on the lowest temperatures. Defaults to 100 if not specified. - **kwargs: Additional keyword arguments to pass to the Plotly Express scatter_3d function for customizing the plot. This can include arguments such as marker size, labels, and color scale options. Returns: plotly.graph_objs._figure.Figure: A Plotly Figure object representing the 3D scatter plot. The figure can be displayed in Jupyter notebooks or saved to an HTML file using Plotly's built-in functions. Note: This function assumes that a function `query_climate_database2` is defined and accessible, which is responsible for querying the database to retrieve the specified number of records with the lowest temperatures, along with their year, month, and country code.
    """
    df = query_climate_database2(n)

    fig = px.scatter_3d(df,
                        x="Temp",
                        y="Year",
                        z="Month",
                        color="country",
                        opacity=0.5,
                        title=f"Year and Month of the {n} Lowest Temperature",
                        **kwargs)
    
    return fig

# Example call to the function
# Note: Ensure that query_climate_database2 function is defined and accessible in your script.
temperature_coefficient_plot(150)

The function is designed to visualize temperature data, showing the relationship between the year, temperature, and potentially other variables.

import plotly.express as px

def temperature_coefficient_plot_2d(n=100, facet_by='Month', **kwargs):
  
    # Assuming query_climate_database2(n) returns a DataFrame with the necessary data
    
    df = query_climate_database2(n)

    # Create a 2D scatter plot faceted by the specified variable (e.g., 'Month')
    fig = px.scatter(df,
                     x="Year",
                     y="Temp",
                     color="country",
                     facet_col=facet_by,  # Faceting by 'Month' or another variable
                     opacity=0.5,
                     title=f"Year and Month of the {n} Lowest Temperatures",
                     **kwargs)
    
    # Adjust layout for readability
    fig.update_layout(autosize=True)
    fig.update_traces(marker=dict(size=5))
                            
    return fig

# Example call to the modified function
fig = temperature_coefficient_plot_2d(150, facet_by='Month')
fig.show()

When compare the highest temperatures between 2 countries, which country has the higher temperature?

database_path = 'weather.db'
conn = sqlite3.connect(database_path)
import plotly.express as px
def temperature_coefficient_plot(country1, country2, n = 100, **kwargs):
    cmd = \
    f"""
    SELECT SUBSTRING(id,1,2) AS country, temp
    FROM temperatures
    WHERE country = "{country1}"
    ORDER BY temp DESC LIMIT {n}
    """

    df = pd.read_sql_query(cmd, conn)
    cmd = \
    f"""
    SELECT SUBSTRING(id,1,2) AS country, temp
    FROM temperatures
    WHERE country = "{country2}"
    ORDER BY temp DESC LIMIT {n}
    """

    df1 = pd.read_sql_query(cmd, conn)
    df = pd.concat([df,df1])
    fig = px.box(df, 
             "Temp",
             color = "country",
             width = 600,
             height = 300,
             title = f"Comparing highest {n} temperatures of {country1} versus {country2}")
    
                            
    return fig

temperature_coefficient_plot("US","CA")